package org.sjcx.jsoup;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * Example program to list links from a URL.
 */
public class gethml {
	static int num = 0;
	public static void main(String[] args) {
		new gethml().getUrl("http://www.biqiuge.com/");
	}

	HttpURLConnection connection = null;
	
	public void getUrl(String url){
		print("Fetching %s...", url);
		Document doc;
		try {
			URL hrefUrl = new URL( url );
			connection = (HttpURLConnection)hrefUrl.openConnection();
			//默认就是Get，可以采用post，大小写都行，因为源码里都toUpperCase了。
			connection.setRequestMethod("GET");
			//是否允许缓存，默认true。
			connection.setUseCaches(Boolean.FALSE);
			//是否开启输出输入，如果是post使用true。默认是false
			//connection.setDoOutput(Boolean.TRUE);
			//connection.setDoInput(Boolean.TRUE);
			//设置请求头信息
			connection.addRequestProperty("Connection", "close");
			//设置连接主机超时（单位：毫秒）  
			connection.setConnectTimeout(8000);  
			//设置从主机读取数据超时（单位：毫秒）  
			connection.setReadTimeout(8000);    
			//设置Cookie
//			connection.addRequestProperty("Cookie","你的Cookies" );
			//开始请求
			doc = Jsoup.parse(connection.getInputStream(), "GBK", url);

			Elements links = doc.select("a[href]");

			print("\nLinks: (%d)", links.size());

			for (int i = 0; i < links.size(); i++) {
				String url1 = links.get(i).attr("abs:href");
				if(url1 != null && url1 != "" && url1.startsWith("http") && !url1.endsWith(".apk")) {
					int status = getHtml( links.get(i).attr("abs:href") );

					if( status == -1){
						getHtml(links.get(++i).attr("abs:href"));
					}
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}

	}


	public int getHtml(String _link){
		print("Fetching %s...", _link);

		Document doc;
		try {
			if(getHttpResponseCode( _link ) != 200) {
				return -1;
			}
			URL url_link = new URL( _link );
			connection = (HttpURLConnection)url_link.openConnection();
			//默认就是Get，可以采用post，大小写都行，因为源码里都toUpperCase了。
			connection.setRequestMethod("GET");
			//是否允许缓存，默认true。
			connection.setUseCaches(Boolean.FALSE);
			//是否开启输出输入，如果是post使用true。默认是false
			//connection.setDoOutput(Boolean.TRUE);
			//connection.setDoInput(Boolean.TRUE);
			//设置请求头信息
			connection.addRequestProperty("Connection", "close");
			//设置连接主机超时（单位：毫秒）  
			connection.setConnectTimeout(8000);  
			//设置从主机读取数据超时（单位：毫秒）  
			connection.setReadTimeout(8000);    
			//设置Cookie
//			connection.addRequestProperty("Cookie","你的Cookies" );
			//开始请求
			doc = Jsoup.parse(connection.getInputStream(), "GBK", _link);

			Elements links = doc.select("a[href]");

			print("\nLinks: (%d)", links.size());
			for (Element link : links) {

				String url = link.attr("abs:href");
				if(url != null && url != "" && url.startsWith("http") && !url.endsWith(".apk")) {
					
					System.out.println(trim(link.text(), 50));
					
//					print(" * a: <%s>", link.attr("abs:href"), trim(link.text(), 35));
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return 0;
	}

	private static void print(String msg, Object... args) {
		System.out.println(String.format(msg, args));
	}

	private static String trim(String s, int width) {
		if (s.length() > width)
			return s.substring(0, width-1) + ".";
		else
			return s;
	}

	public static int getHttpResponseCode(String _link) throws MalformedURLException {
		URL url = new URL( _link );
		HttpURLConnection httpurlconnection = null;
		int responsecode = -1;
		try {
			URLConnection urlconnection = url.openConnection();
			urlconnection.connect();
			if (!(urlconnection instanceof HttpURLConnection)) {
				// urlconnection.disconnect();
				return responsecode;
			}

			httpurlconnection = (HttpURLConnection) urlconnection;

			// 获取返回码,通过responsecode 就可以知道网页的状态,我们也是通过此字段用于判断请求的资源是否存在
			responsecode= httpurlconnection.getResponseCode();
			switch (responsecode) {
			// here valid codes!
			case HttpURLConnection.HTTP_OK:
			case HttpURLConnection.HTTP_MOVED_PERM:
			case HttpURLConnection.HTTP_MOVED_TEMP:
				break;
			default:
				httpurlconnection.disconnect();
			}
		} catch (Exception ioexception) {
			if (httpurlconnection != null) {
				httpurlconnection.disconnect();
			}
			return responsecode;
		}
		return responsecode;
	}
}
